<?php

// Load the Composer autoloader relative to THIS file. A bare "../vendor/..."
// path is resolved against the current working directory, so the script would
// only work when launched from its own folder. require_once also stops right
// here if the autoloader is missing instead of failing later on an unknown class.
require_once __DIR__ . '/../vendor/autoload.php';
// Crawling is a long-running job: remove the execution time limit.
set_time_limit(0);
// Raise the memory ceiling for this process.
ini_set('memory_limit', '1500M');
// Timestamps produced with date() use this timezone.
date_default_timezone_set('Asia/Shanghai');

use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use QL\QueryList;

// Small crawler: starting from a few seed URLs it collects outgoing links into
// a queue ($href), downloads the queued pages and appends every e-mail address
// found in them to rotating result files named "mail1", "mail2", ...
class EmailSpider {

    // Index of the result file currently being written ("mail" . $mail_no).
    private $mail_no = 1;

    // Guzzle HTTP client used for every request.
    private $client;

    // Queue of URLs that still have to be processed.
    public $href = [];

    // TODO: number of worker processes once multi-process crawling is implemented (not used yet).
    private $max = 4;

    // URLs this object has already taken off the queue, keyed by URL, so that
    // pages linking to each other cannot keep the crawler going in circles.
    private $visited = [];

    // Seeds the queue with three start URLs and creates the HTTP client.
    public function __construct($url1, $url2, $url3)
    {
        $this->href = [$url1, $url2, $url3];

        $this->client = new Client();
    }

    // Checks whether $url looks like a URL and can actually be downloaded.
    //
    // Absolute http(s) URLs are used as they are. Protocol-relative links
    // ("//host/path") get an "http:" scheme. Any other link is treated as a
    // path on $_SERVER['HTTP_HOST']; when that host is unknown (for example
    // when running from the command line) the link cannot be resolved and is
    // rejected.
    //
    // Returns the absolute URL on success, false otherwise.
    public function verdictURL($url)
    {
        if (!is_string($url)) {
            return false;
        }

        $url = trim($url);
        // Empty links and pure fragments ("#top") do not point to another page.
        if ('' === $url || '#' === $url[0]) {
            return false;
        }

        // The previous check compared 'https://' (8 characters) with a
        // 7-character prefix, so https links were never recognised as absolute.
        if (!preg_match('#^https?://#i', $url)) {
            if (0 === strpos($url, '//')) {
                $url = 'http:' . $url;
            } else {
                $host = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
                if ('' === $host) {
                    return false;
                }
                // A scheme is required, otherwise the HTTP client cannot request the URL.
                $url = 'http://' . $host . '/' . ltrim($url, '/');
            }
        }

        $regex = '@(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))@';

        if (!preg_match($regex, $url)) {
            return false;
        }

        try {
            // Only the fact that the download succeeds matters here; the body is discarded.
            $this->getContent($url);
            return $url;
        } catch (Exception $e) {
            return false;
        }
    }

    // Downloads $url with a GET request and returns the response body as a string.
    // Uses a 1 second timeout and does not verify TLS certificates.
    // Exceptions thrown by the HTTP client are passed on to the caller.
    public function getContent($url)
    {
        $response = $this->client->request('GET', $url, ['timeout' => 1, 'verify' => false]);
        return $response->getBody()->getContents();
    }

    // Extracts every <a href> from $content, verifies each link with
    // verdictURL() and appends the usable, not yet known ones to the queue.
    //
    // While the queue holds fewer than 10 URLs, the next queued URL is taken
    // off the queue, downloaded and scanned for links as well. This stops once
    // the queue is large enough or nothing downloadable is left.
    //
    // Returns the current queue (the same array as $this->href).
    public function getHref($content)
    {
        $rule = [
            'rule' =>  ['a', 'href'],
        ];

        while (true) {
            // Collect all links of the current page.
            $links = QueryList::Query($content, $rule)->getData();

            foreach ($links as $link) {
                if (!isset($link['rule'])) {
                    continue;
                }

                $verified = $this->verdictURL($link['rule']);
                if (false === $verified) {
                    continue;
                }

                // Skip URLs that are already queued or were crawled before.
                if (isset($this->visited[$verified]) || in_array($verified, $this->href, true)) {
                    continue;
                }

                $this->href[] = $verified;
            }

            if (count($this->href) >= 10) {
                break;
            }

            // Not enough links yet: continue with the next page from the queue.
            $content = $this->fetchNextQueued();
            if (null === $content) {
                break;
            }
        }

        return $this->href;
    }

    // Takes URLs off the queue until one of them can be downloaded and returns
    // its content; returns null when the queue runs empty first.
    private function fetchNextQueued()
    {
        while (count($this->href) > 0) {
            $url = array_shift($this->href);
            if (!is_string($url) || isset($this->visited[$url])) {
                continue;
            }
            $this->visited[$url] = true;

            try {
                return $this->getContent($url);
            } catch (Exception $e) {
                // Unreachable page: try the next queued URL.
            }
        }

        return null;
    }

    // Finds all e-mail addresses in $content and, if there are any, appends
    // them (JSON-encoded preg_match_all() result followed by a timestamp and
    // three newlines) to the current result file.
    public function getEmail($content)
    {
        $time = date('Y-m-d H:i:s', time())."\n\n\n";

        // Case-insensitive, and hyphens are allowed in the first domain label,
        // so addresses such as "Foo@My-Shop.com" are no longer missed.
        $pattern = "/([a-z0-9\-_\.]+@[a-z0-9\-]+\.[a-z0-9\-_\.]+)/i";
        preg_match_all($pattern, $content, $emailArr);

        if (!empty($emailArr[0])) {
            file_put_contents('mail'.$this->mail_no, json_encode($emailArr).$time, FILE_APPEND);
        }
    }

    // Main loop: takes URLs off the queue, stores the e-mail addresses of each
    // page and refills the queue with links once it runs empty.
    //
    // Terminates the whole script (exit) when no URLs are left or after more
    // than 100 result files have been filled.
    public function run()
    {
        while (true) {
            if (0 === count($this->href)) {
                exit('已经没有资源了');
            }

            $url = array_shift($this->href);
            if (!is_string($url) || '' === $url) {
                continue;
            }
            $this->visited[$url] = true;

            try {
                $content = $this->getContent($url);
            } catch (Exception $e) {
                // A single unreachable page must not abort the whole crawl.
                continue;
            }

            // Every downloaded page is scanned for addresses, including the one
            // that is also used to refill the queue below.
            $this->getEmail($content);

            if (0 === count($this->href)) {
                $this->getHref($content);
                if (! count($this->href)) {
                    exit('已经没有资源了');
                }
            }

            // Rotate to the next result file once the current one exceeds ~1 MB.
            // The file may not exist yet, and filesize() results are cached, so
            // the stat cache is cleared before measuring.
            $file = 'mail'.$this->mail_no;
            clearstatcache(true, $file);
            if (is_file($file) && filesize($file) > 1050000) {
                $this->mail_no ++;
            }

            usleep(50000);

            if ($this->mail_no > 100) {
                exit;
            }
        }
    }

}

// Seed pages for the crawl: three Baidu search result URLs.
$url1 = 'http://www.baidu.com/s?wd=%E7%8E%BB%E7%92%83%20%E9%82%AE%E7%AE%B1&rsv_spt=1&rsv_iqid=0x8efb44aa00089c35&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&oq=%25E8%2581%2594%25E7%25B3%25BB%25E9%2582%25AE%25E7%25AE%25B1&rsv_t=79b8tfUgx8aC3ABUN6mm11ngzsuhcz4C9E1%2FdzxSBIHkRE2WbBDwU2CVgBNTznVouxpV&inputT=3235&rsv_pq=e4633883000bbf9a&rsv_sug3=46&rsv_sug2=0&rsv_sug4=3235';
$url2 = 'https://www.baidu.com/s?wd=%E7%95%99%E4%B8%8B%E9%82%AE%E7%AE%B1&rsv_spt=1&rsv_iqid=0xea3f20090003dce2&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=0&oq=%25E7%2595%2599%25E4%25B8%258B%25E9%2582%25AE%25E7%25AE%25B1&rsv_t=26fckRw%2FIwe5jI7ZxfvlyFHV4B2bR3AFo3lDEKsUrHTj0b7ZZPwVTVis%2BX%2BmO3U9R6kA&rsv_pq=a84bec7d00067f34&rsv_sug=1';
$url3 = 'https://www.baidu.com/s?wd=%E5%8F%91%E6%88%91%E4%B8%80%E4%BB%BD%E8%B0%A2%E8%B0%A2&rsv_spt=1&rsv_iqid=0xea3f20090003dce2&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=0&oq=%25E5%258F%2591%25E6%2588%2591%25E4%25B8%2580%25E4%25BB%25BD%25E8%25B0%25A2%25E8%25B0%25A2&rsv_t=f4f3M38npfG6jzZ44ELPDHpPRmUcER%2BKbWul2KbqGAKZVnx8LegncmQRnTDT2m3SSiDL&rsv_pq=90718ae70005d391';

$spider = new EmailSpider($url1, $url2, $url3);

// Take the first seed URL off the queue.
$ar = array_shift($spider->href);

// Download that page; a failed request ends the script with a readable
// message instead of an uncaught exception.
try {
    $content = $spider->getContent($ar);
} catch (Exception $e) {
    exit('Failed to fetch ' . $ar . ': ' . $e->getMessage() . "\n");
}

// getHref() collects the links into $spider->href, so the queue itself is
// dumped rather than the method's return value.
$spider->getHref($content);
$href = $spider->href;

var_dump($href);
// Full crawl entry point, currently disabled:
// $spider->run();
